#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A))
! (C) Copyright 2000- ECMWF.
! (C) Copyright 2000- Meteo-France.
! (C) Copyright 2022- NVIDIA.
! 
! This software is licensed under the terms of the Apache Licence Version 2.0
! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
! In applying this licence, ECMWF does not waive the privileges and immunities
! granted to it by virtue of its status as an intergovernmental organisation
! nor does it submit to any jurisdiction.
!

MODULE SUMP_TRANS_MOD
CONTAINS
SUBROUTINE SUMP_TRANS

! Set up distributed environment for the transform package (part 2)

! Modifications :
! P.Marguinaud : 11-Sep-2012 : Fix twice allocated pointer

USE EC_PARKIND      ,ONLY : JPIM     ,JPRD

USE TPM_GEN         ,ONLY : NOUT, NPRINTLEV
USE TPM_DIM         ,ONLY : R
USE TPM_GEOMETRY    ,ONLY : G
USE TPM_DISTR       ,ONLY : D, LEQ_REGIONS, MYSETW, NPRTRNS, NPRTRW, NPROC, MYPROC

USE SUMPLATF_MOD    ,ONLY : SUMPLATF
USE SUMPLAT_MOD     ,ONLY : SUMPLAT
USE SUSTAONL_MOD    ,ONLY : SUSTAONL
USE MYSENDSET_MOD   ,ONLY : MYSENDSET
USE MYRECVSET_MOD   ,ONLY : MYRECVSET
USE EQ_REGIONS_MOD  ,ONLY : MY_REGION_NS, MY_REGION_EW,           &
      &                      N_REGIONS, N_REGIONS_EW, N_REGIONS_NS
  
!

IMPLICIT NONE

INTEGER(KIND=JPIM) :: JM
INTEGER(KIND=JPIM) :: JGL,IGL,IPLAT,ISENDSET,IRECVSET,JML,IPOS,IM
INTEGER(KIND=JPIM) :: IGPTOT,IMEDIAP,IRESTM,JA,JB,IOFF,OFFSET1,OFFSET2,KMLOC,KM
INTEGER(KIND=JPIM),ALLOCATABLE :: IGPTOTL(:,:)

REAL(KIND=JPRD),ALLOCATABLE :: ZDUM(:)
REAL(KIND=JPRD) :: ZMEDIAP

LOGICAL    :: LLP1,LLP2

!     ------------------------------------------------------------------


LLP1 = NPRINTLEV>0
LLP2 = NPRINTLEV>1
IF(LLP1) WRITE(NOUT,*) '=== ENTER ROUTINE SUMP_TRANS ==='

IF(.NOT.D%LGRIDONLY) THEN
  ALLOCATE(D%NULTPP(NPRTRNS))
  IF(LLP2)WRITE(NOUT,9) 'D%NULTPP   ',SIZE(D%NULTPP   ),SHAPE(D%NULTPP   )
  ALLOCATE(D%NPTRLS(NPRTRNS))
  IF(LLP2)WRITE(NOUT,9) 'D%NPTRLS   ',SIZE(D%NPTRLS   ),SHAPE(D%NPTRLS   )
  ALLOCATE(D%NPROCL(R%NDGL))
  IF(LLP2)WRITE(NOUT,9) 'D%NPROCL   ',SIZE(D%NPROCL   ),SHAPE(D%NPROCL   )

  CALL SUMPLATF(R%NDGL,NPRTRNS,MYSETW,D%NULTPP,D%NPROCL,D%NPTRLS)
  D%NDGL_FS = D%NULTPP(MYSETW)

! Help arrays for spectral to fourier space transposition
  ALLOCATE(D%NLTSGTB (NPRTRNS+1))
  IF(LLP2)WRITE(NOUT,9) 'D%NLTSGTB ',SIZE(D%NLTSGTB),SHAPE(D%NLTSGTB)
  ALLOCATE(D%NLTSFTB (NPRTRNS+1))
  IF(LLP2)WRITE(NOUT,9) 'D%NLTSFTB ',SIZE(D%NLTSFTB),SHAPE(D%NLTSFTB)
  ALLOCATE(D%NSTAGT0B(NPRTRNS+1))
  IF(LLP2)WRITE(NOUT,9) 'D%NSTAGT0B ',SIZE(D%NSTAGT0B),SHAPE(D%NSTAGT0B)
  ALLOCATE(D%NSTAGT1B(NPRTRNS+1))
  IF(LLP2)WRITE(NOUT,9) 'D%NSTAGT1B ',SIZE(D%NSTAGT1B),SHAPE(D%NSTAGT1B)
  ALLOCATE(D%MSTABF  (NPRTRNS+1))
  IF(LLP2)WRITE(NOUT,9) 'D%MSTABF ',SIZE(D%MSTABF),SHAPE(D%MSTABF)

  D%NLTSGTB(:) = 0
  DO JGL=1,D%NDGL_FS
    IGL = D%NPTRLS(MYSETW)+JGL-1
    DO JM=0,G%NMEN(IGL)
      D%NLTSGTB(D%NPROCM(JM)) = D%NLTSGTB(D%NPROCM(JM))+1
    ENDDO
  ENDDO
  DO JA=1,NPRTRW
    IPLAT = 0
    DO JGL=1,D%NULTPP(JA)
      IGL = D%NPTRLS(JA)+JGL-1
      DO JM=1,D%NUMP
        IF(IGL > R%NDGNH-G%NDGLU(D%MYMS(JM)) .AND. IGL <= R%NDGNH+G%NDGLU(D%MYMS(JM))) THEN
          IPLAT = IPLAT + 1
        ENDIF
      ENDDO
    ENDDO
    D%NLTSFTB(JA) = IPLAT
  ENDDO
  
  DO JA=1,NPRTRW-1
    ISENDSET = MYSENDSET(NPRTRW,MYSETW,JA)
    IRECVSET = MYRECVSET(NPRTRW,MYSETW,JA)
    D%MSTABF(IRECVSET) = ISENDSET
  ENDDO
  D%MSTABF(MYSETW) = MYSETW
 
  ALLOCATE(D%NPNTGTB0(0:R%NSMAX,D%NDGL_FS))
  IF(LLP2)WRITE(NOUT,9) 'D%NPNTGTB0 ',SIZE(D%NPNTGTB0 ),SHAPE(D%NPNTGTB0 )
  ALLOCATE(D%NPNTGTB1(D%NUMP,R%NDGL))
  IF(LLP2)WRITE(NOUT,9) 'D%NPNTGTB1 ',SIZE(D%NPNTGTB1 ),SHAPE(D%NPNTGTB1 )

  ! Global offsets of processors
  D%NSTAGT0B(1) = 0
  D%NSTAGT1B(1) = 0
  DO JA=2,NPRTRNS
    D%NSTAGT0B(JA) = D%NSTAGT0B(JA-1)+D%NLTSGTB(JA-1)
    D%NSTAGT1B(JA) = D%NSTAGT1B(JA-1)+D%NLTSFTB(JA-1)
  ENDDO

  ! Global size of foubuf
  D%NLENGT0B = D%NSTAGT0B(NPRTRNS)+D%NLTSGTB(NPRTRNS)
  D%NLENGT1B = D%NSTAGT1B(NPRTRNS)+D%NLTSFTB(NPRTRNS)

  ! Global offsets of grid points
  DO JA=1,NPRTRW
    IPOS = 0
    DO JGL=1,D%NULTPP(MYSETW)
      IGL = D%NPTRLS(MYSETW) + JGL - 1
      DO JML=D%NPTRMS(JA),D%NPTRMS(JA)+D%NUMPP(JA)-1
        IM = D%NALLMS(JML)
        IF (IM  <=  G%NMEN(IGL)) THEN
          D%NPNTGTB0(IM,JGL) = D%NSTAGT0B(D%NPROCM(IM)) + IPOS
          IPOS = IPOS+1
        ELSE
          D%NPNTGTB0(IM,JGL) = -99
        ENDIF
      ENDDO
    ENDDO
  ENDDO

  DO JA=1,NPRTRW
    IPOS = 0
    DO JGL=1,D%NULTPP(JA)
      IGL = D%NPTRLS(JA) + JGL - 1
      DO JM=1,D%NUMP
        IM = D%MYMS(JM)
        IF (IM  <=  G%NMEN(IGL)) THEN
          D%NPNTGTB1(JM,IGL) = D%NSTAGT1B(D%NPROCL(IGL)) + IPOS
          IPOS = IPOS+1
        ELSE
          D%NPNTGTB1(JM,IGL) = -99
        ENDIF
      ENDDO
    ENDDO
  ENDDO

  ! D%NSTAGT0B / D%NSTAGT1B: offset of peer rank in send/recv buffer
  ! D%NLTSGTB  / D%NLTSFTB : size of peer rank in send/recv buffer
  ! D%NPNTGTB0 / D%NPNTGTB1: translation inp to global send buffer / recv to out buffer
ENDIF
  
! GRIDPOINT SPACE

ALLOCATE(D%NFRSTLAT(N_REGIONS_NS))
IF(LLP2)WRITE(NOUT,9) 'D%NFRSTLAT ',SIZE(D%NFRSTLAT ),SHAPE(D%NFRSTLAT )
ALLOCATE(D%NLSTLAT(N_REGIONS_NS))
IF(LLP2)WRITE(NOUT,9) 'D%NLSTLAT  ',SIZE(D%NLSTLAT  ),SHAPE(D%NLSTLAT  )
ALLOCATE(D%NPTRLAT(R%NDGL))
IF(LLP2)WRITE(NOUT,9) 'D%NPTRLAT  ',SIZE(D%NPTRLAT  ),SHAPE(D%NPTRLAT  )
ALLOCATE(D%NPTRFRSTLAT(N_REGIONS_NS))
IF(LLP2)WRITE(NOUT,9) 'D%NPTRFRSTLAT',SIZE(D%NPTRFRSTLAT),SHAPE(D%NPTRFRSTLAT)
ALLOCATE(D%NPTRLSTLAT(N_REGIONS_NS))
IF(LLP2)WRITE(NOUT,9)'D%NPTRLSTLAT',SIZE(D%NPTRLSTLAT),SHAPE(D%NPTRLSTLAT)
ALLOCATE(D%LSPLITLAT(R%NDGL))
IF(LLP2)WRITE(NOUT,9) 'D%LSPLITLAT',SIZE(D%LSPLITLAT),SHAPE(D%LSPLITLAT)
ALLOCATE(D%NPROCA_GP(N_REGIONS_NS))
IF(LLP2)WRITE(NOUT,9) 'D%NPROCA_GP',SIZE(D%NPROCA_GP),SHAPE(D%NPROCA_GP)


IF(.NOT.D%LWEIGHTED_DISTR) THEN
  ALLOCATE(ZDUM(1))
  CALL SUMPLAT(R%NDGL,NPROC,N_REGIONS_NS,MY_REGION_NS,D%LSPLIT,LEQ_REGIONS,&
             &D%NFRSTLAT,D%NLSTLAT,D%NFRSTLOFF,D%NPTRLAT,&
             &D%NPTRFRSTLAT,D%NPTRLSTLAT,D%NPTRFLOFF,&
             &ZDUM,D%LWEIGHTED_DISTR,ZMEDIAP,D%NPROCA_GP,&
             &IMEDIAP,IRESTM,D%LSPLITLAT,MYPROC,G%NLOEN)
ELSE
  CALL SUMPLAT(R%NDGL,NPROC,N_REGIONS_NS,MY_REGION_NS,D%LSPLIT,LEQ_REGIONS,&
             &D%NFRSTLAT,D%NLSTLAT,D%NFRSTLOFF,D%NPTRLAT,&
             &D%NPTRFRSTLAT,D%NPTRLSTLAT,D%NPTRFLOFF,&
             &D%RWEIGHT,D%LWEIGHTED_DISTR,ZMEDIAP,D%NPROCA_GP,&
             &IMEDIAP,IRESTM,D%LSPLITLAT,MYPROC,G%NLOEN)
ENDIF
D%NDGL_GP = D%NLSTLAT(MY_REGION_NS)-D%NFRSTLOFF

IF (LLP1) THEN
  IF(.NOT.D%LGRIDONLY) THEN
    WRITE(NOUT,FMT='(/'' OUTPUT FROM ROUTINE SUMPLAT: ''/)')
    WRITE(NOUT,FMT='('' D%NULTPP '')')
    WRITE(NOUT,FMT='(20(1X,I4))') D%NULTPP(1:NPRTRNS)
    WRITE(NOUT,FMT='('' D%NPROCL '')')
    WRITE(NOUT,FMT='(20(1X,I4))') D%NPROCL(1:R%NDGL)
  ENDIF
  WRITE(NOUT,FMT='('' D%NFRSTLAT '')')
  WRITE(NOUT,FMT='(20(1X,I4))') D%NFRSTLAT(1:N_REGIONS_NS)
  WRITE(NOUT,FMT='('' D%NLSTLAT '')')
  WRITE(NOUT,FMT='(20(1X,I4))') D%NLSTLAT(1:N_REGIONS_NS)
  WRITE(NOUT,FMT='('' D%NFRSTLOFF  D%NPTRFLOFF '')')
  WRITE(NOUT,FMT='(2(1X,I6))') D%NFRSTLOFF, D%NPTRFLOFF
  WRITE(NOUT,FMT='('' D%NPTRLAT '')')
  WRITE(NOUT,FMT='(20(1X,I4))') D%NPTRLAT(1:R%NDGL)
  WRITE(NOUT,FMT='('' D%LSPLITLAT '')')
  WRITE(NOUT,FMT='(50(1X,L1))') D%LSPLITLAT(1:R%NDGL)
  WRITE(NOUT,FMT='('' D%NPTRFRSTLAT '')')
  WRITE(NOUT,FMT='(20(1X,I4))') D%NPTRFRSTLAT(1:N_REGIONS_NS)
  WRITE(NOUT,FMT='('' D%NPTRLSTLAT '')')
  WRITE(NOUT,FMT='(20(1X,I4))') D%NPTRLSTLAT(1:N_REGIONS_NS)
  WRITE(NOUT,FMT='(/)')
ENDIF
ALLOCATE(D%NSTA(R%NDGL+N_REGIONS_NS-1,N_REGIONS_EW))
IF(LLP2)WRITE(NOUT,9) 'D%NSTA     ',SIZE(D%NSTA     ),SHAPE(D%NSTA     )
ALLOCATE(D%NONL(R%NDGL+N_REGIONS_NS-1,N_REGIONS_EW))
IF(LLP2)WRITE(NOUT,9) 'D%NONL     ',SIZE(D%NONL     ),SHAPE(D%NONL     )

IF(.NOT.D%LWEIGHTED_DISTR) THEN
  CALL SUSTAONL(IMEDIAP,IRESTM,D%LWEIGHTED_DISTR,ZDUM,ZMEDIAP,D%NPROCA_GP)
ELSE
  CALL SUSTAONL(IMEDIAP,IRESTM,D%LWEIGHTED_DISTR,D%RWEIGHT,ZMEDIAP,D%NPROCA_GP)
ENDIF
!  IGPTOTL is the number of grid points in each individual processor
ALLOCATE(IGPTOTL(N_REGIONS_NS,N_REGIONS_EW))
IGPTOTL(:,:)=0

DO JA=1,N_REGIONS_NS
  DO JB=1,N_REGIONS(JA)
    IGPTOT = 0
    DO JGL=D%NPTRFRSTLAT(JA),D%NPTRLSTLAT(JA)
      IGPTOT = IGPTOT+D%NONL(JGL,JB)
    ENDDO
    IGPTOTL(JA,JB) = IGPTOT
  ENDDO
ENDDO
D%NGPTOT = IGPTOTL(MY_REGION_NS,MY_REGION_EW)
D%NGPTOTMX = MAXVAL(IGPTOTL)
D%NGPTOTG = SUM(IGPTOTL)
ALLOCATE(D%NGPTOTL(N_REGIONS_NS,N_REGIONS_EW))
IF(LLP2)WRITE(NOUT,9) 'D%NGPTOTL     ',SIZE(D%NGPTOTL ),SHAPE(D%NGPTOTL  )
D%NGPTOTL(:,:) = IGPTOTL(:,:)

IF(.NOT.D%LGRIDONLY) THEN
  ALLOCATE(D%NSTAGTF(D%NDGL_FS+1))
  IF(LLP2)WRITE(NOUT,9) 'D%NSTAGTF     ',SIZE(D%NSTAGTF ),SHAPE(D%NSTAGTF  )
  IOFF = 0
  DO JGL=1,D%NDGL_FS
    D%NSTAGTF(JGL) = IOFF
    IGL = D%NPTRLS(MYSETW) + JGL - 1
    ! Each latitude should be able to store NLON real values, or floor(NLON/2)+1
    ! complex values. Note that IOFF should always be even, because we need to
    ! store complex values (i.e. 2 floats), but this is the case anyway.
    ! WARNING: Extra padding changes results, potentially, though it does not
    ! cause wrong results.
    IOFF = IOFF + (G%NLOEN(IGL)/2+1)*2
  ENDDO
  D%NSTAGTF(D%NDGL_FS+1) = IOFF
  D%NLENGTF = IOFF
ENDIF

IF(ALLOCATED(ZDUM)) DEALLOCATE(ZDUM)
DEALLOCATE(IGPTOTL)

ALLOCATE(D%OFFSETS_GEMM1(D%NUMP+1))
ALLOCATE(D%OFFSETS_GEMM2(D%NUMP+1))

OFFSET1 = 0
OFFSET2 = 0
DO KMLOC=1,D%NUMP
  KM = D%MYMS(KMLOC)
  D%OFFSETS_GEMM1(KMLOC) = OFFSET1
  D%OFFSETS_GEMM2(KMLOC) = OFFSET2

  !KM=0 is transformed in double precision, no need to store here
  IF (KM /= 0) THEN
    OFFSET1 = OFFSET1 + ALIGN(G%NDGLU(KM),8)
    ! N_OFFSET takes the max of the two GEMMs
    OFFSET2 = OFFSET2 + ALIGN((R%NSMAX-KM+3)/2,8)
  ENDIF
ENDDO
D%OFFSETS_GEMM1(D%NUMP+1) = OFFSET1
D%OFFSETS_GEMM2(D%NUMP+1) = OFFSET2

!     ------------------------------------------------------------------
9 FORMAT(1X,'ARRAY ',A10,' ALLOCATED ',8I8)

END SUBROUTINE SUMP_TRANS
END MODULE SUMP_TRANS_MOD

